import numpy as np
import pandas as pd
import re
# Make printed tables account for East Asian wide characters when aligning
# columns (the sheet's headers and values are Chinese text).
pd.set_option('display.unicode.east_asian_width', True)

# Load the raw listing table from the Excel workbook.
df = pd.read_excel('a.xlsx')
def dealYear(year, current_year=2024):
    """Convert a construction-year string into a building age in years.

    Args:
        year: The construction year as a string of digits (e.g. ``'2010'``),
            or any non-string value (for example NaN for a missing cell).
        current_year: Reference year the age is measured from. Defaults to
            2024, the value previously hard-coded here.

    Returns:
        ``current_year - int(year)`` when ``year`` is a string; otherwise
        ``year`` itself, unchanged, so missing values pass straight through.

    Raises:
        ValueError: If ``year`` is a string that ``int`` cannot parse.
    """
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be returned
    # unconverted.
    if isinstance(year, str):
        return current_year - int(year)
    return year
def dealType(ser):
    """Extract room and hall counts from layout strings.

    For each entry the first run of digits is taken as the room count ('室')
    and the second run as the hall count ('厅').

    Args:
        ser: A pandas Series of layout strings such as ``'3室2厅'``.

    Returns:
        An integer DataFrame with columns '室' and '厅' that shares
        ``ser``'s index, so it can be joined back onto the source frame.
        A count stays 0 when the entry is ``'车位'``, is not a string
        (e.g. NaN), or does not contain the corresponding number.
    """
    rooms = np.zeros(len(ser), dtype='int')
    halls = np.zeros(len(ser), dtype='int')

    # Fill by position rather than by label: writing through ``.loc`` with
    # labels that are absent from a default RangeIndex would silently append
    # new rows instead of updating the intended ones.
    for pos, layout in enumerate(ser):
        if not isinstance(layout, str) or layout == '车位':
            continue
        numbers = re.findall(r'\d+', layout)
        if numbers:
            rooms[pos] = int(numbers[0])
        if len(numbers) > 1:
            halls[pos] = int(numbers[1])

    return pd.DataFrame({'室': rooms, '厅': halls}, index=ser.index)

# Unify the layout wording ('房间' -> '室') before parsing, then append the
# '室' / '厅' count columns produced by dealType.
df['户型'] = df['户型'].str.replace('房间', '室')
df = df.join(dealType(df['户型']))

# '年份' loses its '年建' suffix and is turned into a building age.
df['年份'] = df['年份'].str.replace('年建', '').apply(dealYear)

# The remaining measures are text with a unit attached; strip the unit and
# convert to float. '单价' additionally carries ',' separators.
unit_suffixes = {'面积': '平米', '总价': '万', '单价': '元/平'}
df['单价'] = df['单价'].str.replace(',', '')
for column, suffix in unit_suffixes.items():
    df[column] = df[column].str.replace(suffix, '').astype('float')

# Rename the cleaned columns so the headers state their units.
df = df.rename(
    columns={
        '面积': '面积(平方米)',
        '年份': '房龄',
        '总价': '总价(万元)',
        '单价': '单价(元/平方米)',
    }
)

summary_columns = ['总价(万元)', '面积(平方米)', '房龄', '单价(元/平方米)', '室', '厅']
print(df[summary_columns])

# Step 1: show the rows whose layout is '车位', then remove them.
# The row count is printed before and after each removal step.
df1 = df.loc[df['户型'] == '车位']
print(df1)
print(len(df))
df = df.drop(index=df1.index)
print(len(df))


# Step 2: remove rows whose age is below 0 or above 50.
# NaN ages compare False on both sides, so they are kept until step 4.
age_out_of_range = df['房龄'].lt(0) | df['房龄'].gt(50)
df2 = df.loc[age_out_of_range, '房龄']
print(df2)
print(len(df))
df = df.drop(index=df2.index)
print(len(df))

# Step 3: show every row that has an exact duplicate (all copies, because
# keep=False), then keep only the first occurrence of each.
df3 = df.duplicated(keep=False)
print(df.loc[df3])
print(len(df))
df = df.drop_duplicates()
print(len(df))

# Step 4: drop rows with a missing age, then blank out missing tags.
print(len(df))
df = df.dropna(axis='index', subset=['房龄'])
print(len(df))
# NOTE(review): '方源标签' may be a typo for '房源标签' - confirm against the
# spreadsheet header before changing it.
df = df.fillna(value={'方源标签': ''})